Unittest the whitespace stripping logic for tox text generation

The separate class allows this functionality to be unit-tested in isolation.

Change-Id: I1e5eddfb455ca85a662ea38c03302883decc5d58
Reviewed-on: https://gerrit.libreoffice.org/9608
Tested-by: Caolán McNamara <caolanm@redhat.com>
Reviewed-by: Caolán McNamara <caolanm@redhat.com>
diff --git a/sw/CppunitTest_sw_tox.mk b/sw/CppunitTest_sw_tox.mk
new file mode 100644
index 0000000..f372442
--- /dev/null
+++ b/sw/CppunitTest_sw_tox.mk
@@ -0,0 +1,50 @@
# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*-
#
# This file is part of the LibreOffice project.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#

# This file contains the unit test definition for the classes in the source/core/tox subfolder.
# The macro which defines the main method is contained in test_ToxWhitespaceStripper.cxx.

# Declare the CppUnit test target "sw_tox_test".
$(eval $(call gb_CppunitTest_CppunitTest,sw_tox_test))

# Source files which are compiled into the test.
$(eval $(call gb_CppunitTest_add_exception_objects,sw_tox_test, \
	sw/qa/cppunit/tox/test_ToxWhitespaceStripper \
))

# Libraries used by the test; "sw" is the library which contains the class under test.
$(eval $(call gb_CppunitTest_use_libraries,sw_tox_test, \
	comphelper \
	cppu \
	cppuhelper \
	sal \
	svt \
	sw \
	test \
	unotest \
	vcl \
	tl \
	utl \
	$(gb_UWINAPI) \
))

# External dependencies used by the test.
$(eval $(call gb_CppunitTest_use_externals,sw_tox_test, \
	boost_headers \
	libxml2 \
))

# API sets used by the test.
$(eval $(call gb_CppunitTest_use_api,sw_tox_test,\
	offapi \
	udkapi \
))

# Include paths: the sw/inc and sw/source/core/inc headers, followed by the inherited include paths.
$(eval $(call gb_CppunitTest_set_include,sw_tox_test,\
    -I$(SRCDIR)/sw/inc \
    -I$(SRCDIR)/sw/source/core/inc \
    $$(INCLUDE) \
))

# vim: set noet sw=4 ts=4:
diff --git a/sw/Library_sw.mk b/sw/Library_sw.mk
index c9804a0..a7500ad 100644
--- a/sw/Library_sw.mk
+++ b/sw/Library_sw.mk
@@ -389,6 +389,7 @@
    sw/source/core/tox/toxhlp \
    sw/source/core/tox/txmsrt \
    sw/source/core/tox/ToxTextGenerator \
    sw/source/core/tox/ToxWhitespaceStripper \
    sw/source/core/txtnode/SwGrammarContact \
    sw/source/core/txtnode/atrfld \
    sw/source/core/txtnode/atrflyin \
diff --git a/sw/Module_sw.mk b/sw/Module_sw.mk
index 53a19bb..7e71e2c 100644
--- a/sw/Module_sw.mk
+++ b/sw/Module_sw.mk
@@ -46,6 +46,10 @@

endif

$(eval $(call gb_Module_add_check_targets,sw,\
    CppunitTest_sw_tox \
))

$(eval $(call gb_Module_add_slowcheck_targets,sw,\
    CppunitTest_sw_uwriter \
    CppunitTest_sw_htmlexport \
diff --git a/sw/inc/ToxWhitespaceStripper.hxx b/sw/inc/ToxWhitespaceStripper.hxx
new file mode 100644
index 0000000..96b254b
--- /dev/null
+++ b/sw/inc/ToxWhitespaceStripper.hxx
@@ -0,0 +1,52 @@
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */


#ifndef TOXWHITESPACESTRIPPER_HXX_
#define TOXWHITESPACESTRIPPER_HXX_

#include "rtl/ustring.hxx"

#include <vector>

namespace sw {

/** Helper which removes unwanted whitespace from a string that is to be used in a Tox.
 *
 * The stripped string has
 * - newlines changed to spaces,
 * - consecutive spaces merged into a single space, and
 * - a trailing space removed.
 *
 * In addition, positions of the input string can be translated into the corresponding positions of
 * the stripped string. This matters for attributes which have to be carried over, e.g., it answers
 * the question: the 3rd character of the input string is subscript, which character of the output
 * string is that?
 *
 * @note One leading whitespace is preserved.
 */
class SAL_DLLPUBLIC ToxWhitespaceStripper {
public:
    /** Strips @p inputString and records the position mapping for later queries. */
    ToxWhitespaceStripper(const OUString& inputString);

    /** Translates a position of the input string into a position of the stripped string.
     *
     * @param pos a position in the input string; the position directly behind the last
     *            character of the input string may be requested as well.
     * @return the corresponding position in the stripped string.
     */
    sal_Int32 GetPositionInStrippedString(sal_Int32 pos) const;

    /** @return the input string with the unwanted whitespace removed. */
    OUString GetStrippedString() const;

private:
    /// The result of stripping the input string.
    OUString mStripped;
    /// Entry i is the position in the stripped string which corresponds to input position i.
    std::vector<sal_Int32> mNewPositions;
};

} // end namespace sw



#endif /* TOXWHITESPACESTRIPPER_HXX_ */
diff --git a/sw/qa/cppunit/tox/test_ToxWhitespaceStripper.cxx b/sw/qa/cppunit/tox/test_ToxWhitespaceStripper.cxx
new file mode 100644
index 0000000..fdbd47c
--- /dev/null
+++ b/sw/qa/cppunit/tox/test_ToxWhitespaceStripper.cxx
@@ -0,0 +1,150 @@
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */

#include <stdexcept>

#include <sal/types.h>

#include <rtl/ustring.hxx>

#include <ToxWhitespaceStripper.hxx>

#include <cppunit/TestAssert.h>
#include <cppunit/TestFixture.h>
#include <cppunit/extensions/HelperMacros.h>
#include <cppunit/plugin/TestPlugIn.h>

using namespace sw;

/// Test fixture which exercises sw::ToxWhitespaceStripper on its own.
class ToxWhitespaceStripperTest : public CppUnit::TestFixture
{
    /// Checks the mapping of input positions to positions in the stripped string.
    void MappingCharactersToVariousStrippedStringsWorks();

    /// Checks the stripped string which is produced for several inputs.
    void StrippingWhitespacesFromVariousStringsWorks();

    /// Checks that the position directly behind the input string can be queried.
    void PositionAfterStringCanBeRequested();

    // The tests are registered in the same order in which they are declared above.
    CPPUNIT_TEST_SUITE(ToxWhitespaceStripperTest);
    CPPUNIT_TEST(MappingCharactersToVariousStrippedStringsWorks);
    CPPUNIT_TEST(StrippingWhitespacesFromVariousStringsWorks);
    CPPUNIT_TEST(PositionAfterStringCanBeRequested);
    CPPUNIT_TEST_SUITE_END();
};

void
ToxWhitespaceStripperTest::MappingCharactersToVariousStrippedStringsWorks()
{
    {
        OUString test("abc\n");
        ToxWhitespaceStripper sut(test);
        CPPUNIT_ASSERT_EQUAL(0, sut.GetPositionInStrippedString(0));
        CPPUNIT_ASSERT_EQUAL(1, sut.GetPositionInStrippedString(1));
        CPPUNIT_ASSERT_EQUAL(2, sut.GetPositionInStrippedString(2));
        CPPUNIT_ASSERT_EQUAL(3, sut.GetPositionInStrippedString(3));
    }
    {
        OUString test("abc\n\n");
        ToxWhitespaceStripper sut(test);
        CPPUNIT_ASSERT_EQUAL(0, sut.GetPositionInStrippedString(0));
        CPPUNIT_ASSERT_EQUAL(1, sut.GetPositionInStrippedString(1));
        CPPUNIT_ASSERT_EQUAL(2, sut.GetPositionInStrippedString(2));
        CPPUNIT_ASSERT_EQUAL(3, sut.GetPositionInStrippedString(3));
        CPPUNIT_ASSERT_EQUAL(3, sut.GetPositionInStrippedString(4));
    }
    {
        OUString test("abc\ndef");
        ToxWhitespaceStripper sut(test);
        CPPUNIT_ASSERT_EQUAL(0, sut.GetPositionInStrippedString(0));
        CPPUNIT_ASSERT_EQUAL(1, sut.GetPositionInStrippedString(1));
        CPPUNIT_ASSERT_EQUAL(2, sut.GetPositionInStrippedString(2));
        CPPUNIT_ASSERT_EQUAL(3, sut.GetPositionInStrippedString(3));
        CPPUNIT_ASSERT_EQUAL(4, sut.GetPositionInStrippedString(4));
        CPPUNIT_ASSERT_EQUAL(5, sut.GetPositionInStrippedString(5));
        CPPUNIT_ASSERT_EQUAL(6, sut.GetPositionInStrippedString(6));
    }
    {
        //             012345 6789
        OUString test("  abc \ndef");
        //             01234567
        //            " abc def"
        ToxWhitespaceStripper sut(test);
        CPPUNIT_ASSERT_EQUAL(0, sut.GetPositionInStrippedString(0));
        CPPUNIT_ASSERT_EQUAL(0, sut.GetPositionInStrippedString(1));
        CPPUNIT_ASSERT_EQUAL(1, sut.GetPositionInStrippedString(2));
        CPPUNIT_ASSERT_EQUAL(2, sut.GetPositionInStrippedString(3));
        CPPUNIT_ASSERT_EQUAL(3, sut.GetPositionInStrippedString(4));
        CPPUNIT_ASSERT_EQUAL(4, sut.GetPositionInStrippedString(5));
        CPPUNIT_ASSERT_EQUAL(4, sut.GetPositionInStrippedString(6));
        CPPUNIT_ASSERT_EQUAL(5, sut.GetPositionInStrippedString(7));
        CPPUNIT_ASSERT_EQUAL(6, sut.GetPositionInStrippedString(8));
        CPPUNIT_ASSERT_EQUAL(7, sut.GetPositionInStrippedString(9));
    }
}

void
ToxWhitespaceStripperTest::StrippingWhitespacesFromVariousStringsWorks()
{
    {
        OUString test("abc\n");
        OUString expected("abc");
        ToxWhitespaceStripper sut(test);
        CPPUNIT_ASSERT_EQUAL(expected, sut.GetStrippedString());
    }
    {
        OUString test("abc\n\n");
        OUString expected("abc");
        ToxWhitespaceStripper sut(test);
        CPPUNIT_ASSERT_EQUAL(expected, sut.GetStrippedString());
    }
    {
        OUString test("abc\ndef");
        OUString expected("abc def");
        ToxWhitespaceStripper sut(test);
        CPPUNIT_ASSERT_EQUAL(expected, sut.GetStrippedString());
    }
    {
        OUString test("  abc \ndef");
        OUString expected(" abc def");
        ToxWhitespaceStripper sut(test);
        CPPUNIT_ASSERT_EQUAL(expected, sut.GetStrippedString());
    }
    {
        OUString test("  ");
        OUString expected("");
        ToxWhitespaceStripper sut(test);
        CPPUNIT_ASSERT_EQUAL(expected, sut.GetStrippedString());
    }
    {
        OUString test("d  ");
        OUString expected("d");
        ToxWhitespaceStripper sut(test);
        CPPUNIT_ASSERT_EQUAL(expected, sut.GetStrippedString());
    }
}

void
ToxWhitespaceStripperTest::PositionAfterStringCanBeRequested()
{
    OUString test("abc");
    ToxWhitespaceStripper sut(test);
    sal_Int32 expected = test.getLength();
    CPPUNIT_ASSERT_EQUAL(expected, sut.GetPositionInStrippedString(test.getLength()));
}

// Put the test suite in the registry, so that the test runner finds it.
CPPUNIT_TEST_SUITE_REGISTRATION(ToxWhitespaceStripperTest);

// This macro provides the entry point of the test; it has to appear exactly once per test library.
CPPUNIT_PLUGIN_IMPLEMENT();

/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/sw/source/core/tox/ToxTextGenerator.cxx b/sw/source/core/tox/ToxTextGenerator.cxx
index 54d7b6a..8554c88 100644
--- a/sw/source/core/tox/ToxTextGenerator.cxx
+++ b/sw/source/core/tox/ToxTextGenerator.cxx
@@ -33,6 +33,7 @@
#include "fmtpdsc.hxx"
#include "DocumentSettingManager.hxx"
#include "SwStyleNameMapper.hxx"
#include "ToxWhitespaceStripper.hxx"

#include "editeng/tstpitem.hxx"
#include "editeng/lrspitem.hxx"
@@ -49,32 +50,6 @@
        nEndTextPos(nEnd) {}
};

/// Generate String with newlines changed to spaces, consecutive spaces changed
/// to a single space, and trailing space removed.
///
/// An empty input is returned unchanged. Otherwise the string is compacted in
/// place inside a buffer and the unused tail of the buffer is cut off.
OUString lcl_RemoveLineBreaks(const OUString &rRet)
{
    if (rRet.isEmpty())
        return rRet;
    // number of characters which have been dropped so far
    sal_Int32 nOffset = 0;
    // newlines are turned into spaces first, so only spaces have to be merged below
    OUStringBuffer sRet(rRet.replace('\n', ' '));
    for (sal_Int32 i = 1; i < sRet.getLength(); ++i)
    {
        if ( sRet[i - 1] == ' ' && sRet[i] == ' ' )
        {
            // a space which follows a space is dropped
            nOffset += 1;
        }
        else
        {
            // every other character moves to the left by the number of dropped characters
            sRet[i - nOffset] = sRet[i];
        }
    }
    // if the buffer ends with a space, drop one more character: the trailing space
    if (sRet[sRet.getLength() - 1] == ' ')
    {
        nOffset += 1;
    }
    // only the compacted front part of the buffer is returned
    return sRet.copy(0, sRet.getLength() - nOffset).toString();
}

/// Generate String according to the Form and remove the
/// special characters 0-31 and 255.
static OUString lcl_GetNumString( const SwTOXSortTabBase& rBase, bool bUsePrefix, sal_uInt8 nLevel )
@@ -144,8 +119,8 @@
            case TOKEN_ENTRY_TEXT:
                {
                    SwIndex aIdx( pTOXNd, std::min(pTOXNd->GetTxt().getLength(),rTxt.getLength()) );
                    rBase.FillText( *pTOXNd, aIdx );
                    rTxt = lcl_RemoveLineBreaks(rTxt);
                    ToxWhitespaceStripper stripper(rBase.GetTxt().sText);
                    pTOXNd->InsertText(stripper.GetStrippedString(), aIdx);
                }
                break;

@@ -153,10 +128,9 @@
                {
                    // for TOC numbering
                    rTxt += lcl_GetNumString( rBase, true, MAXLEVEL );

                    SwIndex aIdx( pTOXNd, rTxt.getLength() );
                    rBase.FillText( *pTOXNd, aIdx );
                    rTxt = lcl_RemoveLineBreaks(rTxt);
                    ToxWhitespaceStripper stripper(rBase.GetTxt().sText);
                    pTOXNd->InsertText(stripper.GetStrippedString(), aIdx);
                }
                break;

diff --git a/sw/source/core/tox/ToxWhitespaceStripper.cxx b/sw/source/core/tox/ToxWhitespaceStripper.cxx
new file mode 100644
index 0000000..b01c92c
--- /dev/null
+++ b/sw/source/core/tox/ToxWhitespaceStripper.cxx
@@ -0,0 +1,62 @@
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */

#include "ToxWhitespaceStripper.hxx"

#include "rtl/ustrbuf.hxx"
#include <boost/numeric/conversion/cast.hpp>

namespace sw {

/** Strips the given string and records, for every input position, the matching output position.
 *
 * Spaces and newlines count as whitespace: a run of them is merged into a single space, and a
 * whitespace run at the very end of the input is dropped altogether.
 *
 * mNewPositions receives one entry per input character plus a final entry for the position
 * directly behind the input. That final entry equals the length of the stripped string.
 *
 * @param inputString the text which is to be stripped.
 */
ToxWhitespaceStripper::ToxWhitespaceStripper(const OUString& inputString)
{
    OUStringBuffer buffer;
    // one entry per input character, plus one for the position behind the string
    mNewPositions.reserve(static_cast<size_t>(inputString.getLength()) + 1);

    bool lastCharacterWasWhitespace = false;
    for (sal_Int32 pos = 0; pos < inputString.getLength(); ++pos) {
        sal_Unicode cur = inputString[pos];

        if (cur == ' ' || cur == '\n') {
            // merge consecutive whitespaces (and translate them to spaces)
            if (!lastCharacterWasWhitespace) {
                buffer.append(' ');
            }
            lastCharacterWasWhitespace = true;
        }
        else {
            buffer.append(cur);
            lastCharacterWasWhitespace = false;
        }
        // the current input character ended up in the last position of the buffer
        mNewPositions.push_back(buffer.getLength()-1);
    }
    // strip the last whitespace (if there was one); this has to happen before the end position is
    // recorded, otherwise that position would point behind the end of the stripped string
    if (lastCharacterWasWhitespace) {
        buffer.truncate(buffer.getLength() - 1);
    }
    // Add one position if the position after the stripped string is requested, e.g., for attributes which
    // extend beyond the string.
    mNewPositions.push_back(buffer.getLength());
    // take the whole buffer contents instead of relying on the NUL-terminated character pointer
    mStripped = buffer.makeStringAndClear();
}


/** Translates a position of the input string into the matching position of the stripped string.
 *
 * @param pos a position in the input string, or the position directly behind it.
 * @return the position in the stripped string which was recorded for @p pos.
 *
 * A negative @p pos is rejected by boost::numeric_cast and a position beyond the recorded range
 * is rejected by std::vector::at; both report the error by throwing an exception.
 */
sal_Int32
ToxWhitespaceStripper::GetPositionInStrippedString(sal_Int32 pos) const
{
    return mNewPositions.at(boost::numeric_cast<size_t>(pos));
}

/** Returns the stripped string which was computed by the constructor. */
OUString ToxWhitespaceStripper::GetStrippedString() const
{
    return mStripped;
}

}